{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data folder is /media/luca/DATA/DATASETS\n",
      "Experiment save directory is  /media/luca/DATA/EXPERIMENTS\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import experiment_manager as em\n",
    "import far_ho as far"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4\n"
     ]
    }
   ],
   "source": [
    "far.utils._check()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "try: ss.close()\n",
    "except: pass\n",
    "tf.reset_default_graph()\n",
    "ss = tf.InteractiveSession()\n",
    "v1 = tf.Variable([1.,3])\n",
    "\n",
    "v2 = tf.Variable([[-1., -2], [1., 0.]])\n",
    "\n",
    "lmbd = far.get_hyperparameter('lambda', \n",
    "                              initializer=tf.ones_initializer,\n",
    "                             shape=v2.get_shape(), scalar=False)\n",
    "\n",
    "cost = tf.reduce_mean(v1**2) + tf.reduce_sum(lmbd*v2**2)\n",
    "\n",
    "io_optim = far.AdamOptimizer(far.get_hyperparameter('lr', 0.01), epsilon=1.e-6) # same epsilon of rfho\n",
    "# io_optim = far.GradientDescentOptimizer(0.1)\n",
    "\n",
    "# io_optim_dict = io_optim.minimize(cost) \n",
    "\n",
    "oo = tf.reduce_mean(v1*v2)\n",
    "optim_oo = tf.train.AdamOptimizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "farho = far.HyperOptimizer() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "run = farho.minimize(oo, optim_oo, cost, io_optim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tf.global_variables()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# far.utils.hyperparameters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[ 1.,  1.],\n",
       "        [ 1.,  1.]], dtype=float32), 0.0099999998]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tf.global_variables_initializer().run()\n",
    "ss.run(far.utils.hyperparameters())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "optim_dict = list(farho.hypergradient._optimizer_dicts)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optim_dict.dynamics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### check the dynamics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2.9802322e-07, 0.0, 0.0, 2.0647654e-07, 0.0, 0.0, 0.0, 0.0]\n",
      "\n",
      "[2.3841858e-07, 0.0, 2.3283064e-10, 2.0647654e-07, 0.0, 0.0, 0.0, 0.0]\n",
      "\n",
      "[1.7881393e-07, 0.0, 0.0, 2.0647654e-07, 8.4293696e-08, 3.7252903e-09, 0.0, 0.0]\n",
      "\n",
      "[1.1920929e-07, 0.0, 0.0, 1.4600097e-07, 0.0, 7.4505806e-09, 0.0, 0.0]\n",
      "\n",
      "[1.1920929e-07, 0.0, 3.7252903e-09, 1.4600097e-07, 0.0, 0.0, 0.0, 0.0]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for _ in range(5):\n",
    "#     print(ss.run(io_optim._get_beta_accumulators()))\n",
    "#     print(ss.run(io_optim._lr_t))\n",
    "    _res_dy = ss.run(optim_dict.dynamics)\n",
    "    _res_1 = ss.run(optim_dict.iteration)\n",
    "#     print(_res_1)\n",
    "#     print(_res_dy)\n",
    "    \n",
    "#     print()\n",
    "    print([np.linalg.norm(a - b) for a, b in zip(_res_dy, _res_1)])\n",
    "#     print()\n",
    "    print()\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The difference is due to the diferent positioning of the $\\varepsilon$  (inside the square root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "run(10, _skip_hyper_ts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "weights_optim_far = ss.run([v1, v2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[ 1.,  1.],\n",
       "        [ 1.,  1.]], dtype=float32), 0.0099999998]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ss.run(far.utils.hyperparameters())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[  1.74622983e-09,  -1.86264515e-09],\n",
       "        [ -1.74622983e-09,   0.00000000e+00]], dtype=float32), 11.965221]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rev_hypergrads = ss.run(far.utils.hypergradients())\n",
    "rev_hypergrads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.,  1.],\n",
       "       [ 1.,  1.]], dtype=float32)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ss.run(lmbd)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# check with rfho"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Environment variable RFHO_EXP_FOLDER not found. Current directory will be used\n",
      "Experiment save directory is  /media/luca/DATA/Progs/FAR-HO/tests/Experiments\n",
      "Environment variable RFHO_DATA_FOLDER not found. Variables HELP_WIN and HELP_UBUNTU contain info.\n",
      "Data folder is /media/luca/DATA/Progs/FAR-HO/tests\n"
     ]
    }
   ],
   "source": [
    "import rfho as rf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"vectorization:0\", shape=(6,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"augmented_variables/0/vectorization:0\", shape=(6,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"augmented_variables_1/1/vectorization:0\", shape=(6,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"vectorization_1:0\", shape=(18,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Reshape_5:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Reshape_6:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Reshape_7:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Reshape_8:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Reshape_9:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Reshape_10:0\", shape=(2,), dtype=float32)\n"
     ]
    }
   ],
   "source": [
    "w, c, co = rf.vectorize_model([v1, v2], cost, oo, augment=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Reshape:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Assign_1:0\", shape=(2, 2), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Reshape_1:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Assign_2:0\", shape=(2,), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Reshape_2:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Assign_3:0\", shape=(2,), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Reshape_3:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Assign_4:0\", shape=(2, 2), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Reshape_4:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Assign_5:0\", shape=(2, 2), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Reshape_5:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"Adam_2/Assign_6:0\", shape=(2,), dtype=float32_ref)\n"
     ]
    }
   ],
   "source": [
    "lr = tf.Variable(0.01, name='lr')\n",
    "dyn = rf.AdamOptimizer.create(w, lr, loss=c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# w.var_list()[0].assign(1.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Reshape:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Assign:0\", shape=(2, 2), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Reshape_1:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Assign_1:0\", shape=(2, 2), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Reshape_2:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Assign_2:0\", shape=(2,), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Reshape_3:0\", shape=(2, 2), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Assign_3:0\", shape=(2, 2), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Reshape_4:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Assign_4:0\", shape=(2,), dtype=float32_ref)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Reshape_5:0\", shape=(2,), dtype=float32)\n",
      "Warning: creating nodes at tf.Session runtime: node Tensor(\"w_history_ops/Assign_5:0\", shape=(2,), dtype=float32_ref)\n"
     ]
    }
   ],
   "source": [
    "hyperg = rf.HyperOptimizer(dyn, {co: [lmbd, lr]}, rf.ReverseHG)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hyperg.initialize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "hgs = hyperg.run(10, val_feed_dict_suppliers={co:lambda: None},\n",
    "                 _debug_no_hyper_update=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rfho\n",
      "[array([[  1.68111001e-06,   1.34526636e-06],\n",
      "       [ -1.68111001e-06,   0.00000000e+00]], dtype=float32), 11.965094]\n",
      "far\n",
      "[array([[  1.74622983e-09,  -1.86264515e-09],\n",
      "       [ -1.74622983e-09,   0.00000000e+00]], dtype=float32), 11.965221]\n"
     ]
    }
   ],
   "source": [
    "print('rfho')\n",
    "print(ss.run(hyperg.hyper_gradients.hyper_gradient_vars))\n",
    "print('far')\n",
    "print(ss.run(far.utils.hypergradients()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Difference are due to the different implementation of the two methods. far.AdamOptimizer uses the correct dynamics for updating the weights and the ''incorrect'' one, i.e. with $\\sqrt{v - \\varepsilon}$ instead of $\\sqrt{v} + \\varepsilon$,   for computing the hypergradients while rfho.Adam uses the ''incorrect'' also for the updates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.90036476  2.90011191 -0.90035355 -1.90016878  0.90035355  0.        ]\n"
     ]
    }
   ],
   "source": [
    "print(ss.run(w.var_list()[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([ 0.90035141,  2.9001112 ], dtype=float32),\n",
       " array([[-0.90035063, -1.90016854],\n",
       "        [ 0.90035063,  0.        ]], dtype=float32)]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "weights_optim_far"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### CHECK WITH FORWARD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "try: ss.close()\n",
    "except: pass\n",
    "tf.reset_default_graph()\n",
    "ss = tf.InteractiveSession()\n",
    "v1 = tf.Variable([1.,3])\n",
    "\n",
    "v2 = tf.Variable([[-1., -2], [1., 0.]])\n",
    "\n",
    "lmbd = far.get_hyperparameter('lambda', \n",
    "                              initializer=tf.ones_initializer,\n",
    "                             shape=v2.get_shape(), scalar=True)\n",
    "\n",
    "cost = tf.reduce_mean(v1**2) + tf.reduce_sum(lmbd*v2**2)\n",
    "\n",
    "io_optim = far.AdamOptimizer(far.get_hyperparameter('lr', 0.01), epsilon=1.e-6) # same epsilon of rfho\n",
    "# io_optim = far.GradientDescentOptimizer(0.1)\n",
    "\n",
    "# io_optim_dict = io_optim.minimize(cost) \n",
    "\n",
    "oo = tf.reduce_mean(v1*v2)\n",
    "optim_oo = tf.train.AdamOptimizer()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "farho_fw = far.HyperOptimizer(far.ForwardHG())\n",
    "run_fw = farho_fw.minimize(oo, optim_oo, cost, io_optim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tf.global_variables_initializer().run()\n",
    "run_fw(10, _skip_hyper_ts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1.362591e-09, -1.0128521e-09, -1.362591e-09, 0.0, 11.965221]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ss.run(far.utils.hypergradients())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[  1.74622983e-09,  -1.86264515e-09],\n",
       "        [ -1.74622983e-09,   0.00000000e+00]], dtype=float32), 11.965221]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rev_hypergrads"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some very small differences... but acceptable.. (derivative w.r.t. learning rate is the same!) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
